library(faraway)
# Load the data set first so that attach() does not depend on lazy loading.
data(teengamb)
# attach() copies the columns onto the search path; later changes made to
# `teengamb` itself (e.g. converting a column to a factor) are NOT reflected
# in that attached copy.
attach(teengamb)
head(teengamb)
## sex status income verbal gamble
## 1 1 51 2.00 8 0.0
## 2 1 28 2.50 8 0.0
## 3 1 37 2.00 6 0.0
## 4 1 28 7.00 4 7.3
## 5 1 65 2.00 8 19.6
## 6 1 61 3.47 6 0.1
# Per-column numerical summary of the gambling data.
summary(object = teengamb)
## sex status income verbal
## Min. :0.0000 Min. :18.00 Min. : 0.600 Min. : 1.00
## 1st Qu.:0.0000 1st Qu.:28.00 1st Qu.: 2.000 1st Qu.: 6.00
## Median :0.0000 Median :43.00 Median : 3.250 Median : 7.00
## Mean :0.4043 Mean :45.23 Mean : 4.642 Mean : 6.66
## 3rd Qu.:1.0000 3rd Qu.:61.50 3rd Qu.: 6.210 3rd Qu.: 8.00
## Max. :1.0000 Max. :75.00 Max. :15.000 Max. :10.00
## gamble
## Min. : 0.0
## 1st Qu.: 1.1
## Median : 6.0
## Mean : 19.3
## 3rd Qu.: 19.4
## Max. :156.0
Sex is a categorical variable
# Recode the 0/1 sex indicator as a factor so it is treated as categorical.
teengamb[["sex"]] <- as.factor(teengamb[["sex"]])
A numerical summary divided by sex is useful for understanding patterns if present
# Run summary() separately on the rows of each sex level.
by(data = teengamb, INDICES = teengamb$sex, FUN = summary)
## teengamb$sex: 0
## sex status income verbal gamble
## 0:28 Min. :18.00 Min. : 0.600 Min. : 1.000 Min. : 0.000
## 1: 0 1st Qu.:38.00 1st Qu.: 2.000 1st Qu.: 6.000 1st Qu.: 2.775
## Median :51.00 Median : 3.375 Median : 7.000 Median : 14.250
## Mean :52.00 Mean : 4.976 Mean : 6.821 Mean : 29.775
## 3rd Qu.:65.25 3rd Qu.: 6.625 3rd Qu.: 8.250 3rd Qu.: 42.175
## Max. :75.00 Max. :15.000 Max. :10.000 Max. :156.000
## ------------------------------------------------------------
## teengamb$sex: 1
## sex status income verbal gamble
## 0: 0 Min. :18.00 Min. : 1.500 Min. :4.000 Min. : 0.000
## 1:19 1st Qu.:28.00 1st Qu.: 2.000 1st Qu.:6.000 1st Qu.: 0.100
## Median :30.00 Median : 3.000 Median :6.000 Median : 1.700
## Mean :35.26 Mean : 4.149 Mean :6.421 Mean : 3.866
## 3rd Qu.:43.00 3rd Qu.: 5.750 3rd Qu.:8.000 3rd Qu.: 6.000
## Max. :65.00 Max. :10.000 Max. :8.000 Max. :19.600
# Scatterplot matrix of all columns.
pairs(x = teengamb)
# Pairwise correlations of every column except `sex` (the first column).
cor(teengamb[, setdiff(names(teengamb), "sex")])
## status income verbal gamble
## status 1.00000000 -0.2750340 0.5316102 -0.05042081
## income -0.27503402 1.0000000 -0.1755707 0.62207690
## verbal 0.53161022 -0.1755707 1.0000000 -0.22005619
## gamble -0.05042081 0.6220769 -0.2200562 1.00000000
We remove the qualitative variable, for which correlations cannot be evaluated.
Female
# Correlations among the non-sex columns, restricted to rows with sex == 1.
cor(subset(teengamb, sex == 1, select = -sex))
## status income verbal gamble
## status 1.0000000 -0.4870717 0.33460676 0.36090977
## income -0.4870717 1.0000000 -0.21463814 0.08823560
## verbal 0.3346068 -0.2146381 1.00000000 0.07068478
## gamble 0.3609098 0.0882356 0.07068478 1.00000000
Male
# Correlations among the non-sex columns, restricted to rows with sex == 0.
cor(subset(teengamb, sex == 0, select = -sex))
## status income verbal gamble
## status 1.0000000 -0.3454141 0.6296585 -0.3997019
## income -0.3454141 1.0000000 -0.1837980 0.7136690
## verbal 0.6296585 -0.1837980 1.0000000 -0.3325610
## gamble -0.3997019 0.7136690 -0.3325610 1.0000000
Graphical summary
# Gambling expenditure by sex, with columns taken from `teengamb`.
plot(gamble ~ sex, data = teengamb)
We can identify outliers
# Pass the data frame explicitly. The copy attached at the top of the script
# was taken before `sex` was converted to a factor, so a bare `sex` would
# still be the numeric 0/1 column and would not be plotted as groups.
plot(income ~ sex, data = teengamb)
# Gambling expenditure against income, one plotting symbol per sex level.
plot(gamble ~ income, data = teengamb, pch = as.integer(teengamb$sex))
# Load the wages data, put its columns on the search path, and preview it.
data("uswages")
attach(uswages)
head(uswages, n = 6L)
## wage educ exper race smsa ne mw so we pt
## 6085 771.60 18 18 0 1 1 0 0 0 0
## 23701 617.28 15 20 0 1 0 0 0 1 0
## 16208 957.83 16 9 0 1 0 0 1 0 0
## 2720 617.28 12 24 0 1 1 0 0 0 0
## 9723 902.18 14 12 0 1 0 1 0 0 0
## 22239 299.15 12 33 0 1 0 0 0 1 0
# Per-column numerical summary of the wages data.
summary(object = uswages)
## wage educ exper race
## Min. : 50.39 Min. : 0.00 Min. :-2.00 Min. :0.000
## 1st Qu.: 308.64 1st Qu.:12.00 1st Qu.: 8.00 1st Qu.:0.000
## Median : 522.32 Median :12.00 Median :15.00 Median :0.000
## Mean : 608.12 Mean :13.11 Mean :18.41 Mean :0.078
## 3rd Qu.: 783.48 3rd Qu.:16.00 3rd Qu.:27.00 3rd Qu.:0.000
## Max. :7716.05 Max. :18.00 Max. :59.00 Max. :1.000
## smsa ne mw so
## Min. :0.000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.756 Mean :0.229 Mean :0.2485 Mean :0.3125
## 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.000 Max. :1.0000 Max. :1.0000
## we pt
## Min. :0.00 Min. :0.0000
## 1st Qu.:0.00 1st Qu.:0.0000
## Median :0.00 Median :0.0000
## Mean :0.21 Mean :0.0925
## 3rd Qu.:0.00 3rd Qu.:0.0000
## Max. :1.00 Max. :1.0000
Change categorical variables
# Convert all seven 0/1 indicator columns to factors in a single step.
uswages[, c("race", "smsa", "ne", "mw", "so", "we", "pt")] <-
  lapply(
    uswages[, c("race", "smsa", "ne", "mw", "so", "we", "pt")],
    as.factor
  )
A numerical summary divided by race is useful for understanding patterns if present
# Run summary() separately on the rows of each race level.
by(data = uswages, INDICES = uswages$race, FUN = summary)
## uswages$race: 0
## wage educ exper race smsa ne
## Min. : 50.39 Min. : 0.0 Min. :-2.00 0:1844 0: 459 0:1408
## 1st Qu.: 315.81 1st Qu.:12.0 1st Qu.: 8.00 1: 0 1:1385 1: 436
## Median : 522.32 Median :12.0 Median :15.00
## Mean : 620.98 Mean :13.2 Mean :18.29
## 3rd Qu.: 795.59 3rd Qu.:16.0 3rd Qu.:27.00
## Max. :7716.05 Max. :18.0 Max. :59.00
## mw so we pt
## 0:1373 0:1314 0:1437 0:1677
## 1: 471 1: 530 1: 407 1: 167
##
##
##
##
## ------------------------------------------------------------
## uswages$race: 1
## wage educ exper race smsa ne
## Min. : 52.23 Min. : 0.00 Min. :-1.00 0: 0 0: 29 0:134
## 1st Qu.: 237.42 1st Qu.:11.75 1st Qu.: 9.75 1:156 1:127 1: 22
## Median : 398.46 Median :12.00 Median :17.50
## Mean : 456.04 Mean :12.11 Mean :19.83
## 3rd Qu.: 641.03 3rd Qu.:14.00 3rd Qu.:27.00
## Max. :2374.15 Max. :18.00 Max. :58.00
## mw so we pt
## 0:130 0:61 0:143 0:138
## 1: 26 1:95 1: 13 1: 18
##
##
##
##
# Keep only the three numeric columns (columns 4 to 10 are the indicators).
pairs(uswages[, c("wage", "educ", "exper")])
cor(uswages[, c("wage", "educ", "exper")])
## wage educ exper
## wage 1.0000000 0.2483358 0.1832012
## educ 0.2483358 1.0000000 -0.3024788
## exper 0.1832012 -0.3024788 1.0000000
We remove the qualitative variables, for which correlations cannot be evaluated.
Black
# Correlations among wage, educ and exper for rows with race == 1.
cor(subset(uswages, race == 1, select = c(wage, educ, exper)))
## wage educ exper
## wage 1.0000000 0.1869139 0.1398199
## educ 0.1869139 1.0000000 -0.4211823
## exper 0.1398199 -0.4211823 1.0000000
White
# Correlations among wage, educ and exper for rows with race == 0.
cor(subset(uswages, race == 0, select = c(wage, educ, exper)))
## wage educ exper
## wage 1.0000000 0.2445087 0.1909131
## educ 0.2445087 1.0000000 -0.2918642
## exper 0.1909131 -0.2918642 1.0000000
Graphical summary
# Education, experience and wage by race, with columns taken from `uswages`.
plot(educ ~ race, data = uswages)
plot(exper ~ race, data = uswages)
plot(wage ~ race, data = uswages)
We can identify outliers
# Take the columns from `uswages` itself rather than from the attached copy,
# which was created before `race` was converted to a factor.
# Each plot uses one plotting symbol per race level.
plot(wage ~ educ, data = uswages, pch = as.integer(uswages$race))
plot(wage ~ exper, data = uswages, pch = as.integer(uswages$race))
plot(educ ~ exper, data = uswages, pch = as.integer(uswages$race))
# Load the prostate data, put its columns on the search path, and preview it.
data("prostate")
attach(prostate)
head(prostate, n = 6L)
## lcavol lweight age lbph svi lcp gleason pgg45 lpsa
## 1 -0.5798185 2.7695 50 -1.386294 0 -1.38629 6 0 -0.43078
## 2 -0.9942523 3.3196 58 -1.386294 0 -1.38629 6 0 -0.16252
## 3 -0.5108256 2.6912 74 -1.386294 0 -1.38629 7 20 -0.16252
## 4 -1.2039728 3.2828 58 -1.386294 0 -1.38629 6 0 -0.16252
## 5 0.7514161 3.4324 62 -1.386294 0 -1.38629 6 0 0.37156
## 6 -1.0498221 3.2288 50 -1.386294 0 -1.38629 6 0 0.76547
# Per-column numerical summary of the prostate data.
summary(object = prostate)
## lcavol lweight age lbph
## Min. :-1.3471 Min. :2.375 Min. :41.00 Min. :-1.3863
## 1st Qu.: 0.5128 1st Qu.:3.376 1st Qu.:60.00 1st Qu.:-1.3863
## Median : 1.4469 Median :3.623 Median :65.00 Median : 0.3001
## Mean : 1.3500 Mean :3.653 Mean :63.87 Mean : 0.1004
## 3rd Qu.: 2.1270 3rd Qu.:3.878 3rd Qu.:68.00 3rd Qu.: 1.5581
## Max. : 3.8210 Max. :6.108 Max. :79.00 Max. : 2.3263
## svi lcp gleason pgg45
## Min. :0.0000 Min. :-1.3863 Min. :6.000 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:-1.3863 1st Qu.:6.000 1st Qu.: 0.00
## Median :0.0000 Median :-0.7985 Median :7.000 Median : 15.00
## Mean :0.2165 Mean :-0.1794 Mean :6.753 Mean : 24.38
## 3rd Qu.:0.0000 3rd Qu.: 1.1786 3rd Qu.:7.000 3rd Qu.: 40.00
## Max. :1.0000 Max. : 2.9042 Max. :9.000 Max. :100.00
## lpsa
## Min. :-0.4308
## 1st Qu.: 1.7317
## Median : 2.5915
## Mean : 2.4784
## 3rd Qu.: 3.0564
## Max. : 5.5829
Change categorical variables
# Recode the 0/1 svi indicator as a factor so it is treated as categorical.
prostate[["svi"]] <- as.factor(prostate[["svi"]])
A numerical summary divided by svi is useful for understanding patterns if present
# Run summary() separately on the rows of each svi level.
by(data = prostate, INDICES = prostate$svi, FUN = summary)
## prostate$svi: 0
## lcavol lweight age lbph svi
## Min. :-1.3471 Min. :2.375 Min. :41.00 Min. :-1.3863 0:76
## 1st Qu.: 0.3602 1st Qu.:3.310 1st Qu.:60.00 1st Qu.:-1.3863 1: 0
## Median : 1.1616 Median :3.554 Median :64.00 Median : 0.4383
## Mean : 1.0179 Mean :3.624 Mean :63.41 Mean : 0.1655
## 3rd Qu.: 1.6844 3rd Qu.:3.866 3rd Qu.:68.00 3rd Qu.: 1.6438
## Max. : 3.2465 Max. :6.108 Max. :78.00 Max. : 2.3263
## lcp gleason pgg45 lpsa
## Min. :-1.3863 Min. :6.000 Min. : 0.00 Min. :-0.4308
## 1st Qu.:-1.3863 1st Qu.:6.000 1st Qu.: 0.00 1st Qu.: 1.5891
## Median :-1.3863 Median :7.000 Median : 5.00 Median : 2.2345
## Mean :-0.6715 Mean :6.632 Mean :17.63 Mean : 2.1366
## 3rd Qu.:-0.3637 3rd Qu.:7.000 3rd Qu.:26.25 3rd Qu.: 2.8079
## Max. : 2.3273 Max. :9.000 Max. :95.00 Max. : 4.0298
## ------------------------------------------------------------
## prostate$svi: 1
## lcavol lweight age lbph svi
## Min. :1.215 Min. :3.237 Min. :44.00 Min. :-1.3863 0: 0
## 1st Qu.:1.997 1st Qu.:3.582 1st Qu.:62.00 1st Qu.:-1.3863 1:21
## Median :2.661 Median :3.774 Median :68.00 Median :-0.5276
## Mean :2.552 Mean :3.755 Mean :65.52 Mean :-0.1353
## 3rd Qu.:2.907 3rd Qu.:3.897 3rd Qu.:69.00 3rd Qu.: 1.3481
## Max. :3.821 Max. :4.718 Max. :79.00 Max. : 2.0082
## lcp gleason pgg45 lpsa
## Min. :-1.386 Min. :7.00 Min. : 10.00 Min. :2.214
## 1st Qu.: 1.179 1st Qu.:7.00 1st Qu.: 30.00 1st Qu.:3.056
## Median : 1.910 Median :7.00 Median : 50.00 Median :3.565
## Mean : 1.602 Mean :7.19 Mean : 48.81 Mean :3.715
## 3rd Qu.: 2.420 3rd Qu.:7.00 3rd Qu.: 60.00 3rd Qu.:4.130
## Max. : 2.904 Max. :9.00 Max. :100.00 Max. :5.583
# Drop `svi` (column 5) before plotting and correlating the numeric columns.
pairs(prostate[, names(prostate) != "svi"])
cor(prostate[, names(prostate) != "svi"])
## lcavol lweight age lbph lcp gleason
## lcavol 1.00000000 0.194128387 0.2249999 0.02734971 0.67531058 0.432417052
## lweight 0.19412839 1.000000000 0.3075247 0.43493174 0.10023889 -0.001283003
## age 0.22499988 0.307524741 1.0000000 0.35018592 0.12766778 0.268891599
## lbph 0.02734971 0.434931744 0.3501859 1.00000000 -0.00699944 0.077820444
## lcp 0.67531058 0.100238891 0.1276678 -0.00699944 1.00000000 0.514829912
## gleason 0.43241705 -0.001283003 0.2688916 0.07782044 0.51482991 1.000000000
## pgg45 0.43365224 0.050846195 0.2761124 0.07846000 0.63152807 0.751904512
## lpsa 0.73446028 0.354121818 0.1695929 0.17980950 0.54881316 0.368986693
## pgg45 lpsa
## lcavol 0.4336522 0.7344603
## lweight 0.0508462 0.3541218
## age 0.2761124 0.1695929
## lbph 0.0784600 0.1798095
## lcp 0.6315281 0.5488132
## gleason 0.7519045 0.3689867
## pgg45 1.0000000 0.4223157
## lpsa 0.4223157 1.0000000
We remove the qualitative variable, for which correlations cannot be evaluated.
With seminal vesicle invasion
# Correlations among the non-svi columns for rows with svi == 1.
cor(subset(prostate, svi == 1, select = -svi))
## lcavol lweight age lbph lcp gleason
## lcavol 1.0000000 0.2016764497 -0.24688623 -0.14899899 0.53983778 -0.1330999
## lweight 0.2016764 1.0000000000 0.23071571 0.08582420 0.19705639 -0.1409462
## age -0.2468862 0.2307157098 1.00000000 0.33635108 -0.03212035 0.1679336
## lbph -0.1489990 0.0858242041 0.33635108 1.00000000 -0.05658159 0.3176501
## lcp 0.5398378 0.1970563913 -0.03212035 -0.05658159 1.00000000 0.1004554
## gleason -0.1330999 -0.1409461563 0.16793362 0.31765006 0.10045543 1.0000000
## pgg45 -0.1155055 -0.0003723927 0.31423433 0.29951832 0.27429066 0.5967197
## lpsa 0.4721634 0.0122011324 -0.31466981 -0.18892183 0.06489636 -0.3210767
## pgg45 lpsa
## lcavol -0.1155055281 0.47216337
## lweight -0.0003723927 0.01220113
## age 0.3142343310 -0.31466981
## lbph 0.2995183241 -0.18892183
## lcp 0.2742906649 0.06489636
## gleason 0.5967197325 -0.32107670
## pgg45 1.0000000000 -0.22100690
## lpsa -0.2210068987 1.00000000
Without seminal vesicle invasion
# Correlations among the non-svi columns for rows with svi == 0.
cor(subset(prostate, svi == 0, select = -svi))
## lcavol lweight age lbph lcp gleason
## lcavol 1.0000000 0.157723389 0.2768497 0.12484824 0.50355027 0.39252917
## lweight 0.1577234 1.000000000 0.3176750 0.50781401 0.01090040 -0.02433415
## age 0.2768497 0.317675007 1.0000000 0.37245238 0.09440620 0.26610078
## lbph 0.1248482 0.507814006 0.3724524 1.00000000 0.09986765 0.06892492
## lcp 0.5035503 0.010900396 0.0944062 0.09986765 1.00000000 0.50262560
## gleason 0.3925292 -0.024334155 0.2661008 0.06892492 0.50262560 1.00000000
## pgg45 0.3169860 0.001492569 0.2333695 0.09129667 0.55197656 0.74941027
## lpsa 0.6495756 0.416117460 0.2472757 0.38765367 0.32929037 0.36236698
## pgg45 lpsa
## lcavol 0.316985985 0.6495756
## lweight 0.001492569 0.4161175
## age 0.233369469 0.2472757
## lbph 0.091296671 0.3876537
## lcp 0.551976564 0.3292904
## gleason 0.749410267 0.3623670
## pgg45 1.000000000 0.3392641
## lpsa 0.339264111 1.0000000
Graphical summary
# Age and lweight by svi level, then lweight and lcavol against age.
# All columns are taken from `prostate`.
plot(age ~ svi, data = prostate)
plot(lweight ~ svi, data = prostate)
plot(lweight ~ age, data = prostate)
plot(lcavol ~ age, data = prostate)
# Load the SAT data and put its columns on the search path.
data("sat")
attach(what = sat)
## The following object is masked from teengamb:
##
## verbal
# Per-column numerical summary of the SAT data.
summary(object = sat)
## expend ratio salary takers
## Min. :3.656 Min. :13.80 Min. :25.99 Min. : 4.00
## 1st Qu.:4.882 1st Qu.:15.22 1st Qu.:30.98 1st Qu.: 9.00
## Median :5.768 Median :16.60 Median :33.29 Median :28.00
## Mean :5.905 Mean :16.86 Mean :34.83 Mean :35.24
## 3rd Qu.:6.434 3rd Qu.:17.57 3rd Qu.:38.55 3rd Qu.:63.00
## Max. :9.774 Max. :24.30 Max. :50.05 Max. :81.00
## verbal math total
## Min. :401.0 Min. :443.0 Min. : 844.0
## 1st Qu.:427.2 1st Qu.:474.8 1st Qu.: 897.2
## Median :448.0 Median :497.5 Median : 945.5
## Mean :457.1 Mean :508.8 Mean : 965.9
## 3rd Qu.:490.2 3rd Qu.:539.5 3rd Qu.:1032.0
## Max. :516.0 Max. :592.0 Max. :1107.0
# Scatterplot matrix and correlation matrix of all SAT columns.
pairs(x = sat)
cor(x = sat)
## expend ratio salary takers verbal math
## expend 1.0000000 -0.371025386 0.869801513 0.5926274 -0.41004987 -0.34941409
## ratio -0.3710254 1.000000000 -0.001146081 -0.2130536 0.06376664 0.09542173
## salary 0.8698015 -0.001146081 1.000000000 0.6167799 -0.47696364 -0.40131282
## takers 0.5926274 -0.213053607 0.616779867 1.0000000 -0.89326296 -0.86938393
## verbal -0.4100499 0.063766636 -0.476963635 -0.8932630 1.00000000 0.97025604
## math -0.3494141 0.095421730 -0.401312817 -0.8693839 0.97025604 1.00000000
## total -0.3805370 0.081253823 -0.439883381 -0.8871187 0.99150325 0.99350238
## total
## expend -0.38053700
## ratio 0.08125382
## salary -0.43988338
## takers -0.88711868
## verbal 0.99150325
## math 0.99350238
## total 1.00000000
High correlations are observed between salary and expenditure, verbal and math, verbal and total, and math and total.
We can observe those strong correlations graphically.
# Name the data frame explicitly instead of relying on the search path.
# `verbal` exists in more than one attached data set, so a bare reference
# depends on the order in which the data sets were attached.
plot(expend ~ salary, data = sat)
plot(verbal ~ math, data = sat)
plot(verbal ~ total, data = sat)
plot(math ~ total, data = sat)
# Load the divorce data, put its columns on the search path, and preview it.
data("divusa")
attach(divusa)
head(divusa, n = 6L)
## year divorce unemployed femlab marriage birth military
## 1 1920 8.0 5.2 22.70 92.0 117.9 3.2247
## 2 1921 7.2 11.7 22.79 83.0 119.8 3.5614
## 3 1922 6.6 6.7 22.88 79.7 111.2 2.4553
## 4 1923 7.1 2.4 22.97 85.2 110.5 2.2065
## 5 1924 7.2 5.0 23.06 80.3 110.9 2.2889
## 6 1925 7.2 3.2 23.15 79.2 106.6 2.1735
Numerical summary
# Per-column numerical summary of the divorce data.
summary(object = divusa)
## year divorce unemployed femlab
## Min. :1920 Min. : 6.10 Min. : 1.200 Min. :22.70
## 1st Qu.:1939 1st Qu.: 8.70 1st Qu.: 4.200 1st Qu.:27.47
## Median :1958 Median :10.60 Median : 5.600 Median :37.10
## Mean :1958 Mean :13.27 Mean : 7.173 Mean :38.58
## 3rd Qu.:1977 3rd Qu.:20.30 3rd Qu.: 7.500 3rd Qu.:47.80
## Max. :1996 Max. :22.80 Max. :24.900 Max. :59.30
## marriage birth military
## Min. : 49.70 Min. : 65.30 Min. : 1.940
## 1st Qu.: 61.90 1st Qu.: 68.90 1st Qu.: 3.469
## Median : 74.10 Median : 85.90 Median : 9.102
## Mean : 72.97 Mean : 88.89 Mean :12.365
## 3rd Qu.: 80.00 3rd Qu.:107.30 3rd Qu.:14.266
## Max. :118.10 Max. :122.90 Max. :86.641
# Correlation matrix of all columns (all are numeric at this point).
cor(x = divusa)
## year divorce unemployed femlab marriage
## year 1.000000000 0.87923753 -0.2344792 0.98598207 -0.6173255
## divorce 0.879237534 1.00000000 -0.2106019 0.91039698 -0.5342554
## unemployed -0.234479195 -0.21060188 1.0000000 -0.25746176 -0.2707630
## femlab 0.985982068 0.91039698 -0.2574618 1.00000000 -0.6486273
## marriage -0.617325533 -0.53425537 -0.2707630 -0.64862728 1.0000000
## birth -0.576313991 -0.72192425 -0.3138890 -0.60409490 0.6737273
## military 0.007267171 0.01857483 -0.4002930 0.05126339 0.2581983
## birth military
## year -0.5763140 0.007267171
## divorce -0.7219242 0.018574832
## unemployed -0.3138890 -0.400292954
## femlab -0.6040949 0.051263390
## marriage 0.6737273 0.258198260
## birth 1.0000000 0.140898643
## military 0.1408986 1.000000000
Graphical summary
# Scatterplot matrix of all columns.
pairs(divusa)
# Keep `year` numeric. Converting it to character makes the formula plots
# treat each year as its own category (one degenerate box per year) and
# would break any later numeric use of `divusa`, such as cor(divusa).
# With a numeric year, the series are drawn as lines over time.
plot(marriage ~ year, data = divusa, type = "l")
plot(femlab ~ year, data = divusa, type = "l")